Here we do exploratory data analysis on HMDA data obtained for Pennsylvania in the year 2014. We will start by looking at the data superficially and then dive into columns of interest. Then we check for any missing values and handle them. Let's get started with the steps. ## Global setup, like the working directory and data directory, should happen here.
# Global setup: attach `sys` and move the working directory two levels up
# from the current one (presumably to the project root).
# NOTE(review): `sys` is loaded but not obviously used below — confirm it is needed.
library(sys)
working_directory <- getwd()
# NOTE(review): setwd() inside a notebook chunk only lasts for that chunk;
# the knitr root.dir option in the setup chunk is the reliable way to do
# this (see the RStudio message emitted right after this chunk).
setwd(dirname(dirname(working_directory)))
The working directory was changed to /Users/omkarpawar/Desktop/csp-571-02-final-project/src inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the working directory for notebook chunks.
writeLines("")
getwd()
[1] "/Users/omkarpawar/Desktop/csp-571-02-final-project/src"
data_dir <- "/Users/omkarpawar/Desktop/Data/PA/"
# Install any required packages that are not yet installed.
# https://stackoverflow.com/questions/4090169/elegant-way-to-check-for-missing-packages-and-install-them
# NOTE: ggplot2, rstudioapi and janitor are loaded later in this script, so
# they are included in the list as well.
list_of_packages <- c("mlbench", "corrplot", "rvest", "tidyr", "stringr",
                      "dplyr", "lubridate", "data.table", "mice", "scales",
                      "naniar", "rpart", "rpart.plot", "caret", "moments",
                      "ggplot2", "rstudioapi", "janitor")
new.packages <- list_of_packages[!(list_of_packages %in% installed.packages()[, "Package"])]
if (length(new.packages) > 0) {
  # writeLines() instead of print(): print("...\n") shows the "\n" literally.
  writeLines("Installing packages")
  # BUG FIX: new.packages is a character vector, not a function —
  # install.packages(new.packages()) failed with "attempt to apply non-function".
  install.packages(new.packages)
}
# Attach the libraries used throughout this analysis.
library(corrplot)
library(ggplot2)
library(tidyr)
library(stringr)
library(dplyr)
library(data.table)
library(mice)
library(rstudioapi)
library(naniar)
library(moments)
# Source the shared helper scripts (plotting / modelling utilities) relative
# to this notebook's location. rstudioapi::getActiveDocumentContext() only
# works when the file is open and run interactively inside RStudio.
source(paste(dirname(dirname(dirname(rstudioapi::getActiveDocumentContext()$path))), "utils/utils.r", sep="/"))
source(paste(dirname(dirname(dirname(rstudioapi::getActiveDocumentContext()$path))), "utils/model_utils.r", sep="/"))
# Load the 2014 Pennsylvania HMDA records with data.table::fread (fast CSV reader).
hmda_data_pa <- fread(paste(data_dir, "hmda_2014_pa_all-records_labels.csv", sep=""))
|--------------------------------------------------|
|==================================================|
Lets see first few rows of our data and what they tell about the application.
# Work with a plain data.frame (rather than a data.table) from here on.
hmda_data_pa_df <- as.data.frame(hmda_data_pa)
head(hmda_data_pa_df)
We know that there are different types of loans. Let's see how they are distributed.
# Count applications per loan type and plot the distribution.
# FIX: dplyr::funs() is deprecated (removed in newer dplyr); count() produces
# the same per-type totals, keeping the count column named loan_type so the
# aes() mapping is unchanged.
loan_type_counts <- hmda_data_pa_df %>%
  count(loan_type_name, name = "loan_type")
ggplot(data = loan_type_counts, aes(x = loan_type_name, y = loan_type)) +
  geom_bar(stat = "identity", fill = "#009E73") +
  geom_text(aes(label = loan_type), vjust = -0.5) +
  labs(title = "Type of Loans Distribution", x = "Loan Type", y = "Count")
It's pretty clear that the conventional loan type receives the most applications. This is also the focus of this project. ### Filter to conventional loans only. Then we print the column names of the data.
# Filter to include conventional loans only.
# FIX: loan_type is an integer column (see glimpse output), so compare
# against an integer — the original `== "1"` only worked via implicit
# coercion of the whole column to character.
hmda_data_pa_df <- hmda_data_pa_df[hmda_data_pa_df$loan_type == 1L, ]
colnames(hmda_data_pa_df)
[1] "as_of_year" "respondent_id"
[3] "agency_name" "agency_abbr"
[5] "agency_code" "loan_type_name"
[7] "loan_type" "property_type_name"
[9] "property_type" "loan_purpose_name"
[11] "loan_purpose" "owner_occupancy_name"
[13] "owner_occupancy" "loan_amount_000s"
[15] "preapproval_name" "preapproval"
[17] "action_taken_name" "action_taken"
[19] "msamd_name" "msamd"
[21] "state_name" "state_abbr"
[23] "state_code" "county_name"
[25] "county_code" "census_tract_number"
[27] "applicant_ethnicity_name" "applicant_ethnicity"
[29] "co_applicant_ethnicity_name" "co_applicant_ethnicity"
[31] "applicant_race_name_1" "applicant_race_1"
[33] "applicant_race_name_2" "applicant_race_2"
[35] "applicant_race_name_3" "applicant_race_3"
[37] "applicant_race_name_4" "applicant_race_4"
[39] "applicant_race_name_5" "applicant_race_5"
[41] "co_applicant_race_name_1" "co_applicant_race_1"
[43] "co_applicant_race_name_2" "co_applicant_race_2"
[45] "co_applicant_race_name_3" "co_applicant_race_3"
[47] "co_applicant_race_name_4" "co_applicant_race_4"
[49] "co_applicant_race_name_5" "co_applicant_race_5"
[51] "applicant_sex_name" "applicant_sex"
[53] "co_applicant_sex_name" "co_applicant_sex"
[55] "applicant_income_000s" "purchaser_type_name"
[57] "purchaser_type" "denial_reason_name_1"
[59] "denial_reason_1" "denial_reason_name_2"
[61] "denial_reason_2" "denial_reason_name_3"
[63] "denial_reason_3" "rate_spread"
[65] "hoepa_status_name" "hoepa_status"
[67] "lien_status_name" "lien_status"
[69] "edit_status_name" "edit_status"
[71] "sequence_number" "population"
[73] "minority_population" "hud_median_family_income"
[75] "tract_to_msamd_income" "number_of_owner_occupied_units"
[77] "number_of_1_to_4_family_units" "application_date_indicator"
writeLines("")
head(hmda_data_pa_df, 10)
NA
We see that there are a lot of redundant columns in our data. We might need to remove these columns while modelling our data and use codes instead of strings for training the model.
Print glimpse of dataset i.e a vertical preview of the dataset.
dim(hmda_data_pa_df)
[1] 322785 78
writeLines("Glimpse of hmda dataset for PA")
Glimpse of hmda dataset for PA
glimpse(hmda_data_pa_df)
Observations: 322,785
Variables: 78
$ as_of_year [3m[38;5;246m<int>[39m[23m 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, …
$ respondent_id [3m[38;5;246m<chr>[39m[23m "41-1795868", "0000024891", "62-1532940", "41-1795868", "000…
$ agency_name [3m[38;5;246m<chr>[39m[23m "Department of Housing and Urban Development", "Office of th…
$ agency_abbr [3m[38;5;246m<chr>[39m[23m "HUD", "OCC", "HUD", "HUD", "FDIC", "HUD", "FDIC", "NCUA", "…
$ agency_code [3m[38;5;246m<int>[39m[23m 7, 1, 7, 7, 3, 7, 3, 5, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, …
$ loan_type_name [3m[38;5;246m<chr>[39m[23m "Conventional", "Conventional", "Conventional", "Conventiona…
$ loan_type [3m[38;5;246m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ property_type_name [3m[38;5;246m<chr>[39m[23m "One-to-four family dwelling (other than manufactured housin…
$ property_type [3m[38;5;246m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ loan_purpose_name [3m[38;5;246m<chr>[39m[23m "Refinancing", "Home purchase", "Home purchase", "Refinancin…
$ loan_purpose [3m[38;5;246m<int>[39m[23m 3, 1, 1, 3, 3, 1, 1, 3, 3, 3, 3, 1, 3, 1, 3, 1, 3, 3, 3, 3, …
$ owner_occupancy_name [3m[38;5;246m<chr>[39m[23m "Owner-occupied as a principal dwelling", "Owner-occupied as…
$ owner_occupancy [3m[38;5;246m<int>[39m[23m 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, …
$ loan_amount_000s [3m[38;5;246m<int>[39m[23m 81, 132, 158, 236, 53, 150, 108, 265, 196, 67, 185, 42, 179,…
$ preapproval_name [3m[38;5;246m<chr>[39m[23m "Not applicable", "Not applicable", "Preapproval was not req…
$ preapproval [3m[38;5;246m<int>[39m[23m 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, …
$ action_taken_name [3m[38;5;246m<chr>[39m[23m "Application denied by financial institution", "Loan origina…
$ action_taken [3m[38;5;246m<int>[39m[23m 3, 1, 3, 4, 1, 6, 1, 1, 3, 1, 1, 6, 2, 1, 4, 6, 3, 1, 5, 6, …
$ msamd_name [3m[38;5;246m<chr>[39m[23m "", "York, Hanover - PA", "Lancaster - PA", "Montgomery Coun…
$ msamd [3m[38;5;246m<int>[39m[23m NA, 49620, 29540, 33874, 10900, 33874, NA, 33874, NA, 25420,…
$ state_name [3m[38;5;246m<chr>[39m[23m "Pennsylvania", "Pennsylvania", "Pennsylvania", "Pennsylvani…
$ state_abbr [3m[38;5;246m<chr>[39m[23m "PA", "PA", "PA", "PA", "PA", "PA", "PA", "PA", "PA", "PA", …
$ state_code [3m[38;5;246m<int>[39m[23m 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, …
$ county_name [3m[38;5;246m<chr>[39m[23m "Lawrence County", "York County", "Lancaster County", "Bucks…
$ county_code [3m[38;5;246m<int>[39m[23m 73, 133, 71, 17, 95, 17, 47, 29, 107, 43, 55, 45, 11, 17, 10…
$ census_tract_number [3m[38;5;246m<dbl>[39m[23m 111.00, 207.10, 117.04, 1016.03, 162.02, 1055.10, 9511.00, 3…
$ applicant_ethnicity_name [3m[38;5;246m<chr>[39m[23m "Not Hispanic or Latino", "Not Hispanic or Latino", "Not His…
$ applicant_ethnicity [3m[38;5;246m<int>[39m[23m 2, 2, 2, 2, 2, 4, 2, 2, 3, 2, 2, 4, 2, 3, 2, 4, 2, 2, 3, 4, …
$ co_applicant_ethnicity_name [3m[38;5;246m<chr>[39m[23m "No co-applicant", "No co-applicant", "No co-applicant", "No…
$ co_applicant_ethnicity [3m[38;5;246m<int>[39m[23m 5, 5, 5, 5, 5, 4, 5, 2, 3, 5, 5, 5, 5, 3, 5, 4, 2, 5, 5, 4, …
$ applicant_race_name_1 [3m[38;5;246m<chr>[39m[23m "White", "White", "White", "Black or African American", "Whi…
$ applicant_race_1 [3m[38;5;246m<int>[39m[23m 5, 5, 5, 3, 5, 7, 5, 5, 6, 5, 5, 7, 3, 6, 3, 7, 5, 3, 6, 7, …
$ applicant_race_name_2 [3m[38;5;246m<chr>[39m[23m "", "", "", "White", "", "", "", "", "", "", "", "", "", "",…
$ applicant_race_2 [3m[38;5;246m<int>[39m[23m NA, NA, NA, 5, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ applicant_race_name_3 [3m[38;5;246m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", …
$ applicant_race_3 [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ applicant_race_name_4 [3m[38;5;246m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", …
$ applicant_race_4 [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ applicant_race_name_5 [3m[38;5;246m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", …
$ applicant_race_5 [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ co_applicant_race_name_1 [3m[38;5;246m<chr>[39m[23m "No co-applicant", "No co-applicant", "No co-applicant", "No…
$ co_applicant_race_1 [3m[38;5;246m<int>[39m[23m 8, 8, 8, 8, 8, 7, 8, 5, 6, 8, 8, 8, 8, 6, 8, 7, 5, 8, 8, 7, …
$ co_applicant_race_name_2 [3m[38;5;246m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", …
$ co_applicant_race_2 [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ co_applicant_race_name_3 [3m[38;5;246m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", …
$ co_applicant_race_3 [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ co_applicant_race_name_4 [3m[38;5;246m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", …
$ co_applicant_race_4 [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ co_applicant_race_name_5 [3m[38;5;246m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", …
$ co_applicant_race_5 [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ applicant_sex_name [3m[38;5;246m<chr>[39m[23m "Male", "Male", "Female", "Female", "Female", "Not applicabl…
$ applicant_sex [3m[38;5;246m<int>[39m[23m 1, 1, 2, 2, 2, 4, 1, 1, 3, 2, 1, 4, 1, 3, 1, 4, 2, 2, 3, 4, …
$ co_applicant_sex_name [3m[38;5;246m<chr>[39m[23m "No co-applicant", "No co-applicant", "No co-applicant", "No…
$ co_applicant_sex [3m[38;5;246m<int>[39m[23m 5, 5, 5, 5, 5, 4, 5, 2, 3, 5, 5, 5, 5, 3, 5, 4, 1, 5, 5, 4, …
$ applicant_income_000s [3m[38;5;246m<int>[39m[23m 30, 49, 76, 105, 53, NA, 36, 145, NA, 19, 48, NA, 80, 146, 9…
$ purchaser_type_name [3m[38;5;246m<chr>[39m[23m "Loan was not originated or was not sold in calendar year co…
$ purchaser_type [3m[38;5;246m<int>[39m[23m 0, 3, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, …
$ denial_reason_name_1 [3m[38;5;246m<chr>[39m[23m "Other", "", "Collateral", "", "", "", "", "", "", "", "", "…
$ denial_reason_1 [3m[38;5;246m<int>[39m[23m 9, NA, 4, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ denial_reason_name_2 [3m[38;5;246m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", …
$ denial_reason_2 [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ denial_reason_name_3 [3m[38;5;246m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", …
$ denial_reason_3 [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ rate_spread [3m[38;5;246m<dbl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ hoepa_status_name [3m[38;5;246m<chr>[39m[23m "Not a HOEPA loan", "Not a HOEPA loan", "Not a HOEPA loan", …
$ hoepa_status [3m[38;5;246m<int>[39m[23m 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
$ lien_status_name [3m[38;5;246m<chr>[39m[23m "Secured by a first lien", "Secured by a first lien", "Secur…
$ lien_status [3m[38;5;246m<int>[39m[23m 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 4, 1, 1, 1, 4, 1, 1, 1, 4, …
$ edit_status_name [3m[38;5;246m<chr>[39m[23m "", "", "", "", "", "", "", "", "Quality edit failure only",…
$ edit_status [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, 6, NA, NA, NA, NA, NA, NA, N…
$ sequence_number [3m[38;5;246m<int>[39m[23m 100583, 2869, 22604, 96739, 339, 15020, 294, 689, 180055, 10…
$ population [3m[38;5;246m<int>[39m[23m 4182, 6517, 4340, 3193, 4351, 4748, 5669, 6319, 2186, 1824, …
$ minority_population [3m[38;5;246m<dbl>[39m[23m 2.53, 10.74, 14.19, 15.31, 6.07, 18.60, 1.94, 11.00, 4.39, 2…
$ hud_median_family_income [3m[38;5;246m<int>[39m[23m 54100, 68700, 67200, 95400, 68800, 95400, 54100, 95400, 5410…
$ tract_to_msamd_income [3m[38;5;246m<dbl>[39m[23m 96.77, 93.10, 161.91, 69.79, 102.17, 107.64, 106.73, 135.67,…
$ number_of_owner_occupied_units [3m[38;5;246m<int>[39m[23m 1261, 2130, 1362, 774, 1329, 1603, 1822, 1964, 552, 590, 215…
$ number_of_1_to_4_family_units [3m[38;5;246m<int>[39m[23m 1652, 2541, 1533, 1078, 1608, 1945, 2265, 1972, 901, 666, 24…
$ application_date_indicator [3m[38;5;246m<int>[39m[23m 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, …
Now, lets look at the missing values that are present in our data. We go through this in 4 steps. First we look for any NAs, then empty string, NULL values and at last we look for missing values encoded as “?”
writeLines("Checking for missing values with NA")
# Per-column NA counts. colSums(is.na(...)) is the vectorized idiom and
# avoids the sapply()-in-scripts anti-pattern; it returns the same named
# vector of counts as the original sapply version.
colSums(is.na(hmda_data_pa_df))
as_of_year respondent_id agency_name
0 0 0
agency_abbr agency_code loan_type_name
0 0 0
loan_type property_type_name property_type
0 0 0
loan_purpose_name loan_purpose owner_occupancy_name
0 0 0
owner_occupancy loan_amount_000s preapproval_name
0 0 0
preapproval action_taken_name action_taken
0 0 0
msamd_name msamd state_name
0 28898 0
state_abbr state_code county_name
0 0 0
county_code census_tract_number applicant_ethnicity_name
442 925 0
applicant_ethnicity co_applicant_ethnicity_name co_applicant_ethnicity
0 0 0
applicant_race_name_1 applicant_race_1 applicant_race_name_2
0 0 0
applicant_race_2 applicant_race_name_3 applicant_race_3
321968 0 322717
applicant_race_name_4 applicant_race_4 applicant_race_name_5
0 322750 0
applicant_race_5 co_applicant_race_name_1 co_applicant_race_1
322753 0 0
co_applicant_race_name_2 co_applicant_race_2 co_applicant_race_name_3
0 322511 0
co_applicant_race_3 co_applicant_race_name_4 co_applicant_race_4
322763 0 322776
co_applicant_race_name_5 co_applicant_race_5 applicant_sex_name
0 322777 0
applicant_sex co_applicant_sex_name co_applicant_sex
0 0 0
applicant_income_000s purchaser_type_name purchaser_type
24218 0 0
denial_reason_name_1 denial_reason_1 denial_reason_name_2
0 275432 0
denial_reason_2 denial_reason_name_3 denial_reason_3
313186 0 321434
rate_spread hoepa_status_name hoepa_status
317302 0 0
lien_status_name lien_status edit_status_name
0 0 0
edit_status sequence_number population
270997 0 927
minority_population hud_median_family_income tract_to_msamd_income
935 925 991
number_of_owner_occupied_units number_of_1_to_4_family_units application_date_indicator
963 945 0
writeLines("Checking for missing values with empty strings")
# Per-column count of empty strings.
# FIX: without na.rm = TRUE, any column containing NA reported NA instead of
# a count (visible as the NA entries in the original output).
sapply(hmda_data_pa_df, function(x) sum(x == "", na.rm = TRUE))
as_of_year respondent_id agency_name
0 0 0
agency_abbr agency_code loan_type_name
0 0 0
loan_type property_type_name property_type
0 0 0
loan_purpose_name loan_purpose owner_occupancy_name
0 0 0
owner_occupancy loan_amount_000s preapproval_name
0 0 0
preapproval action_taken_name action_taken
0 0 0
msamd_name msamd state_name
28898 NA 0
state_abbr state_code county_name
0 0 442
county_code census_tract_number applicant_ethnicity_name
NA NA 0
applicant_ethnicity co_applicant_ethnicity_name co_applicant_ethnicity
0 0 0
applicant_race_name_1 applicant_race_1 applicant_race_name_2
0 0 321968
applicant_race_2 applicant_race_name_3 applicant_race_3
NA 322717 NA
applicant_race_name_4 applicant_race_4 applicant_race_name_5
322750 NA 322753
applicant_race_5 co_applicant_race_name_1 co_applicant_race_1
NA 0 0
co_applicant_race_name_2 co_applicant_race_2 co_applicant_race_name_3
322511 NA 322763
co_applicant_race_3 co_applicant_race_name_4 co_applicant_race_4
NA 322776 NA
co_applicant_race_name_5 co_applicant_race_5 applicant_sex_name
322777 NA 0
applicant_sex co_applicant_sex_name co_applicant_sex
0 0 0
applicant_income_000s purchaser_type_name purchaser_type
NA 0 0
denial_reason_name_1 denial_reason_1 denial_reason_name_2
275432 NA 313186
denial_reason_2 denial_reason_name_3 denial_reason_3
NA 321434 NA
rate_spread hoepa_status_name hoepa_status
NA 0 0
lien_status_name lien_status edit_status_name
0 0 270997
edit_status sequence_number population
NA 0 NA
minority_population hud_median_family_income tract_to_msamd_income
NA NA NA
number_of_owner_occupied_units number_of_1_to_4_family_units application_date_indicator
NA NA 0
writeLines("Checking for missing values with ?")
# Per-column count of "?" placeholders.
# FIX: na.rm = TRUE, otherwise columns containing NA reported NA instead of
# a count (visible as the NA entries in the original output).
sapply(hmda_data_pa_df, function(x) sum(x == "?", na.rm = TRUE))
as_of_year respondent_id agency_name
0 0 0
agency_abbr agency_code loan_type_name
0 0 0
loan_type property_type_name property_type
0 0 0
loan_purpose_name loan_purpose owner_occupancy_name
0 0 0
owner_occupancy loan_amount_000s preapproval_name
0 0 0
preapproval action_taken_name action_taken
0 0 0
msamd_name msamd state_name
0 NA 0
state_abbr state_code county_name
0 0 0
county_code census_tract_number applicant_ethnicity_name
NA NA 0
applicant_ethnicity co_applicant_ethnicity_name co_applicant_ethnicity
0 0 0
applicant_race_name_1 applicant_race_1 applicant_race_name_2
0 0 0
applicant_race_2 applicant_race_name_3 applicant_race_3
NA 0 NA
applicant_race_name_4 applicant_race_4 applicant_race_name_5
0 NA 0
applicant_race_5 co_applicant_race_name_1 co_applicant_race_1
NA 0 0
co_applicant_race_name_2 co_applicant_race_2 co_applicant_race_name_3
0 NA 0
co_applicant_race_3 co_applicant_race_name_4 co_applicant_race_4
NA 0 NA
co_applicant_race_name_5 co_applicant_race_5 applicant_sex_name
0 NA 0
applicant_sex co_applicant_sex_name co_applicant_sex
0 0 0
applicant_income_000s purchaser_type_name purchaser_type
NA 0 0
denial_reason_name_1 denial_reason_1 denial_reason_name_2
0 NA 0
denial_reason_2 denial_reason_name_3 denial_reason_3
NA 0 NA
rate_spread hoepa_status_name hoepa_status
NA 0 0
lien_status_name lien_status edit_status_name
0 0 0
edit_status sequence_number population
NA 0 NA
minority_population hud_median_family_income tract_to_msamd_income
NA NA NA
number_of_owner_occupied_units number_of_1_to_4_family_units application_date_indicator
NA NA 0
writeLines("Checking for missing values with null")
# FIX: `x == NULL` compares against a zero-length object, so the sum was
# always 0 regardless of the data. Data.frame columns can never contain
# NULL; what can occur is the literal text "NULL", so that is what we count
# here, mirroring the "" and "?" checks above.
sapply(hmda_data_pa_df, function(x) sum(x == "NULL", na.rm = TRUE))
as_of_year respondent_id agency_name
0 0 0
agency_abbr agency_code loan_type_name
0 0 0
loan_type property_type_name property_type
0 0 0
loan_purpose_name loan_purpose owner_occupancy_name
0 0 0
owner_occupancy loan_amount_000s preapproval_name
0 0 0
preapproval action_taken_name action_taken
0 0 0
msamd_name msamd state_name
0 0 0
state_abbr state_code county_name
0 0 0
county_code census_tract_number applicant_ethnicity_name
0 0 0
applicant_ethnicity co_applicant_ethnicity_name co_applicant_ethnicity
0 0 0
applicant_race_name_1 applicant_race_1 applicant_race_name_2
0 0 0
applicant_race_2 applicant_race_name_3 applicant_race_3
0 0 0
applicant_race_name_4 applicant_race_4 applicant_race_name_5
0 0 0
applicant_race_5 co_applicant_race_name_1 co_applicant_race_1
0 0 0
co_applicant_race_name_2 co_applicant_race_2 co_applicant_race_name_3
0 0 0
co_applicant_race_3 co_applicant_race_name_4 co_applicant_race_4
0 0 0
co_applicant_race_name_5 co_applicant_race_5 applicant_sex_name
0 0 0
applicant_sex co_applicant_sex_name co_applicant_sex
0 0 0
applicant_income_000s purchaser_type_name purchaser_type
0 0 0
denial_reason_name_1 denial_reason_1 denial_reason_name_2
0 0 0
denial_reason_2 denial_reason_name_3 denial_reason_3
0 0 0
rate_spread hoepa_status_name hoepa_status
0 0 0
lien_status_name lien_status edit_status_name
0 0 0
edit_status sequence_number population
0 0 0
minority_population hud_median_family_income tract_to_msamd_income
0 0 0
number_of_owner_occupied_units number_of_1_to_4_family_units application_date_indicator
0 0 0
First, we look at race and ethnicity columns and see what information they provide and how is the distribution per variable.
library(janitor)
writeLines("")
writeLines("Application ethnicity values")
Application ethnicity values
unique(hmda_data_pa_df$applicant_ethnicity_name)
[1] "Not Hispanic or Latino"
[2] "Not applicable"
[3] "Information not provided by applicant in mail, Internet, or telephone application"
[4] "Hispanic or Latino"
writeLines("")
writeLines("Application race name 1 values")
Application race name 1 values
unique(hmda_data_pa_df$applicant_race_1)
[1] 5 3 7 6 2 4 1
unique(hmda_data_pa_df$applicant_race_name_1)
[1] "White"
[2] "Black or African American"
[3] "Not applicable"
[4] "Information not provided by applicant in mail, Internet, or telephone application"
[5] "Asian"
[6] "Native Hawaiian or Other Pacific Islander"
[7] "American Indian or Alaska Native"
Now, let's filter the dataframe to Hispanic or Latino applicants and print the count according to race.
# Count Hispanic/Latino applicants by primary race, replace any NA counts
# with 0, add a row-total column (janitor::adorn_totals), and sort by that
# total, descending.
grouped_by_race_info <- hmda_data_pa_df %>% filter(applicant_ethnicity_name == "Hispanic or Latino") %>%
group_by(applicant_race_name_1) %>%
count() %>%
ungroup() %>%
replace(is.na(.), 0) %>%
adorn_totals(c("col")) %>%
arrange(-Total)
head(grouped_by_race_info)
applicant_race_name_1 n Total
White 6777 6777
Information not provided by applicant in mail, Internet, or telephone application 1192 1192
Black or African American 447 447
Native Hawaiian or Other Pacific Islander 205 205
American Indian or Alaska Native 204 204
Asian 74 74
We do this because we want to merge these two columns into one and deal with it as one single predictor.
# Merge ethnicity and race into a single predictor: anyone flagged as
# Hispanic or Latino gets that label; everyone else keeps their primary race.
# (The upfront `<- NA` initialisations were redundant — ifelse() assigns
# every row — so they are dropped.)
hmda_data_pa_df$applicant_race_and_ethnicity <- ifelse(
  hmda_data_pa_df$applicant_ethnicity_name == "Hispanic or Latino",
  "Hispanic or Latino", hmda_data_pa_df$applicant_race_name_1)
hmda_data_pa_df$co_applicant_race_and_ethnicity <- ifelse(
  hmda_data_pa_df$co_applicant_ethnicity_name == "Hispanic or Latino",
  "Hispanic or Latino", hmda_data_pa_df$co_applicant_race_name_1)
writeLines("")
writeLines("Unique values for the applicant_race_and_ethnicity column")
Unique values for the applicant_race_and_ethnicity column
writeLines("")
unique(hmda_data_pa_df$applicant_race_and_ethnicity)
[1] "White"
[2] "Black or African American"
[3] "Not applicable"
[4] "Information not provided by applicant in mail, Internet, or telephone application"
[5] "Hispanic or Latino"
[6] "Asian"
[7] "Native Hawaiian or Other Pacific Islander"
[8] "American Indian or Alaska Native"
# Shorten the verbose "information not provided" label.
# CONSISTENCY FIX: the original recoded only the applicant column, leaving
# the co-applicant column with the long label; recode both.
no_info_label <- "Information not provided by applicant in mail, Internet, or telephone application"
hmda_data_pa_df$applicant_race_and_ethnicity[hmda_data_pa_df$applicant_race_and_ethnicity == no_info_label] <- "No Information Provided"
hmda_data_pa_df$co_applicant_race_and_ethnicity[hmda_data_pa_df$co_applicant_race_and_ethnicity == no_info_label] <- "No Information Provided"
head(hmda_data_pa_df)
NA
See how the distribution looks for loan applications according to race and ethnicity. We summarise the count of applications according to the applicant's race.
# Applications per race-and-ethnicity category, most frequent first.
# count(..., sort = TRUE) is equivalent to group_by + summarise(n()) +
# arrange(desc(...)).
mortgage_by_race_and_ethnicity <- hmda_data_pa_df %>%
  count(applicant_race_and_ethnicity, name = "EthnicityCount", sort = TRUE)
graph_by_enthicity(mortgage_by_race_and_ethnicity)
The bar chart shows that most applications are made by the White population, which is expected as the US has a majority White population. Now, let's dive deeper and see how actions are taken on applications for each race and ethnicity category. # Graph which applicant races and ethnicities have the largest proportion of loans # in various stages. These include origination status, denied status, etc.
# For every (action taken, race/ethnicity) pair, count applications.
mortgage_status_by_race_and_ethnicity <- hmda_data_pa_df %>% group_by(action_taken_name, applicant_race_and_ethnicity) %>%
summarise(ActionCount = n()) %>%
arrange(desc(ActionCount))
# Join each pair's count with that category's total and express it as a
# percentage of the category. The join key (applicant_race_and_ethnicity)
# is inferred by inner_join, hence the "Joining, by =" message.
mortgage_status_aggregated_by_race_and_ethnicity = inner_join(mortgage_status_by_race_and_ethnicity, mortgage_by_race_and_ethnicity) %>% mutate(percentage = (ActionCount / EthnicityCount) * 100)
Joining, by = "applicant_race_and_ethnicity"
graph_application_race_proportion_of_loans(mortgage_status_aggregated_by_race_and_ethnicity)
The graph above clearly shows that the denial rate is more for minorities, and to be more specific, it is more for African Americans. One more thing to notice is that the category where applicants race is unknown, most of them are purchased by the institution.
Now let's see how the income distribution looks for applicants. Let's see the median income for each category.
# Median applicant income (in $000s) per race-and-ethnicity category.
# FIX: dplyr::funs() is deprecated (removed in newer dplyr); a plain
# summarise() computes the same per-group median.
m <- hmda_data_pa_df %>%
  group_by(applicant_race_and_ethnicity) %>%
  summarise(applicant_income_000s = median(applicant_income_000s, na.rm = TRUE))
ggplot(data = m, aes(x = reorder(applicant_race_and_ethnicity, applicant_income_000s), y = applicant_income_000s)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  geom_text(aes(label = applicant_income_000s), size = 3, nudge_y = 5) +
  coord_flip() +
  labs(title = "Median of applicants income by Race and Ethnicity", y = "Median Income", x = "Race and Ethnicity")
We see that Asians have the largest median income value amongst all. At the bottom, we have African Americans and Hispanic or Latino
Let's go ahead and see the income distribution for each category. We do this by plotting a boxplot.
# Income spread per race-and-ethnicity category as a boxplot.
# FIX: dplyr::group_by() has no effect on ggplot() (ggplot groups via aes),
# so the data frame is passed directly.
p <- ggplot(hmda_data_pa_df, aes(x = applicant_race_and_ethnicity, y = applicant_income_000s)) +
  geom_boxplot() +
  labs(title = "Applicants Income According to Race And Ethnicity", x = "Applicant Race and Ethnicity", y = "Income distribution in thousands")
p + coord_flip()
In this boxplot we can see that there are a lot of outliers in the data and income is very much spread out. It could be a case that the income values are highly skewed.
Now we filter the data for originated applications and see how is the distribution.
# Keep only originated loans (action_taken code 1).
# FIX: action_taken is an integer column, so compare against an integer
# rather than the string "1".
hmda_origination_status_df <- hmda_data_pa_df[hmda_data_pa_df$action_taken == 1L, ]
# Median income per category for originated loans.
# FIX: dplyr::funs() is deprecated; a plain summarise() computes the same.
m <- hmda_origination_status_df %>%
  group_by(applicant_race_and_ethnicity) %>%
  summarise(applicant_income_000s = median(applicant_income_000s, na.rm = TRUE))
ggplot(data = m, aes(x = reorder(applicant_race_and_ethnicity, applicant_income_000s), y = applicant_income_000s)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  geom_text(aes(label = applicant_income_000s), size = 3, nudge_y = 5) +
  coord_flip() +
  labs(title = "Median of applicants income by Race and Ethnicity", y = "Median Income", x = "Race and Ethnicity")
The median applicant income for each category is higher in this subset. This is understandable, as applicants with higher income get loans approved more easily. That is not the only factor, but it is one of the conditions.
# Income spread per category, restricted to originated loans.
# FIX: group_by() has no effect on ggplot(); pass the data frame directly.
p <- ggplot(hmda_origination_status_df, aes(x = applicant_race_and_ethnicity, y = applicant_income_000s)) +
  geom_boxplot() +
  labs(title = "Approved Loan Applicant's Income Distribution", x = "Applicant Race and Ethnicity", y = "Income distribution in thousands")
p + coord_flip()
We now see the overall distribution using histograms.
graph_applicant_income_histogram(hmda_origination_status_df, "Applicant income distribution for originated loans")
Here we see that the applicant income is right skewed. We would need to transform these kind of variables if we are fitting them to the model that assumes normality.
Now we see the distribution of applicant income where their income amount is less than 500K.
# Histogram of originated-loan applicant incomes below $500K.
# FIX: the aes() previously re-indexed the original data frame instead of
# referencing the column of the data frame being plotted; give the subset a
# named column and map that.
temp <- data.frame(
  income = hmda_origination_status_df$applicant_income_000s[
    hmda_origination_status_df$applicant_income_000s < 500])
ggplot(data = temp, aes(x = income)) +
  geom_histogram(fill = "steelblue") +
  labs(title = "Applicant with loan approved and income below 500K",
       x = "Income in thousands", y = "Count")
Now let's see the proportion of loans that were granted and denied. The action taken on an application is stored in action_taken and has many values. We need to collapse them into two classes, Granted or Not Granted. So we make a new column, Decision, which has the values Approved or Denied. We also add one value, Paperwork Issues, to record the applications that were closed due to incomplete paperwork or other reasons.
# Collapse the detailed action_taken_name into a 3-level Decision column:
# Denied / Approved / Paperwork Issues (everything else: withdrawn,
# incomplete, purchased, etc.).
# %in% replaces the chained ==/| comparisons, and the redundant `<- NA`
# initialisation is dropped (ifelse assigns every row).
denied_actions <- c("Application denied by financial institution",
                    "Preapproval request denied by financial institution")
approved_actions <- c("Loan originated",
                      "Application approved but not accepted")
hmda_data_pa_df$Decision <- ifelse(
  hmda_data_pa_df$action_taken_name %in% denied_actions, "Denied",
  ifelse(hmda_data_pa_df$action_taken_name %in% approved_actions,
         "Approved", "Paperwork Issues"))
# Cross-tabulate Decision by race/ethnicity and plot side-by-side bars.
tbl <- with(hmda_data_pa_df, table(Decision, applicant_race_and_ethnicity))
ggplot(as.data.frame(tbl), aes(factor(applicant_race_and_ethnicity), Freq, fill = Decision)) +
  geom_col(position = 'dodge') +
  labs(title = "Proportion of loans denied by financial Institutions",
       x = "Race and Ethnicity", y = "Number of applicants") +
  coord_flip()
The approval proportion for Whites is higher than that of other categories. Let's now plot the action-taken distribution with raw values and see if we can spot any different trend.
# Same comparison, but using the raw action_taken_name categories rather
# than the collapsed Decision column.
action_counts <- with(hmda_data_pa_df,
                      table(action_taken_name, applicant_race_and_ethnicity))
action_counts_df <- as.data.frame(action_counts)
ggplot(action_counts_df,
       aes(factor(applicant_race_and_ethnicity), Freq, fill = action_taken_name)) +
  geom_col(position = 'dodge') +
  labs(title = "Proportion of loans denied by financial Institutions",
       x = "Race and Ethnicity", y = "Number of applicants")
Here too , we can see that the data tells approval rates for Whites is high. ## Graph loan distribution by county.
# Top-20 counties by share of all applications.
mortgage_distribution_by_counties <- hmda_data_pa_df %>%
  filter(!is.na(county_name)) %>%
  count(county_name, name = "CountLoans") %>%
  mutate(percentage = 100 * CountLoans / sum(CountLoans)) %>%
  mutate(county_name = reorder(county_name, percentage)) %>%
  arrange(desc(percentage)) %>%
  head(20)
graph_distribution_by_county(mortgage_distribution_by_counties)
# Top-20 counties by share of originated loans only.
originated_mortgage_distribution_by_counties <- hmda_origination_status_df %>%
  filter(!is.na(county_name)) %>%
  count(county_name, name = "CountLoans") %>%
  mutate(percentage = 100 * CountLoans / sum(CountLoans)) %>%
  mutate(county_name = reorder(county_name, percentage)) %>%
  arrange(desc(percentage)) %>%
  head(20)
graph_distribution_by_county(originated_mortgage_distribution_by_counties)
# Four high-volume counties examined individually.
county_names <- c("Allegheny County", "Philadelphia County", "Montgomery County", "Bucks County")
# Per county: distribution of applications by primary applicant race
# (applicant_race_name_1), plotted with the shared utils helper.
for (county_name in county_names) {
hmda_data_county_df <- hmda_data_pa_df[hmda_data_pa_df$county_name == county_name, ]
mortgage_by_race_county <- hmda_data_county_df %>% group_by(applicant_race_name_1) %>%
summarise(RaceCount = n()) %>% arrange(desc(RaceCount))
# print() is required to render ggplot objects inside a for loop.
print(graph_mortgage_distribution_by_race1(mortgage_by_race_county))
}
# Per county: income histograms for originated loans (action_taken == "1"),
# separately for White and for Black or African American applicants.
for (county_name in county_names) {
hmda_origination_status_df_by_county_white <- hmda_data_pa_df[hmda_data_pa_df$action_taken == "1" & hmda_data_pa_df$county_name == county_name & hmda_data_pa_df$applicant_race_name_1 == "White", ]
print(graph_applicant_income_histogram(hmda_origination_status_df_by_county_white, "Income distribution for Whites"))
hmda_origination_status_df_by_county_african_american <- hmda_data_pa_df[hmda_data_pa_df$action_taken == "1" & hmda_data_pa_df$county_name == county_name & hmda_data_pa_df$applicant_race_name_1 == "Black or African American", ]
print(graph_applicant_income_histogram(hmda_origination_status_df_by_county_african_american, "Income distribution for African Americans"))
}
# Repeat the per-county breakdown using the combined race-and-ethnicity column.
county_names <- c("Allegheny County", "Philadelphia County", "Montgomery County", "Bucks County")
for (county in county_names) {
  rows_in_county <- hmda_data_pa_df[hmda_data_pa_df$county_name == county, ]
  race_ethnicity_counts <- rows_in_county %>%
    group_by(applicant_race_and_ethnicity) %>%
    summarise(RaceCount = n()) %>%
    arrange(desc(RaceCount))
  print(graph_mortgage_distribution_by_race_and_ethnicity(race_ethnicity_counts))
}
# For each focus county, compute the percentage of each action_taken outcome
# within each race-and-ethnicity group, and plot the proportions.
for (county_name in county_names) {
  hmda_data_county_df <- hmda_data_pa_df[hmda_data_pa_df$county_name == county_name, ]
  # Denominator: total applications per race/ethnicity group in this county.
  mortgage_by_race1_county <- hmda_data_county_df %>%
    group_by(applicant_race_and_ethnicity) %>%
    summarise(RaceCount = n()) %>%
    arrange(desc(RaceCount))
  # Numerator: applications per (action, race/ethnicity) pair.
  mortgage_status_by_race1_by_county <- hmda_data_county_df %>%
    group_by(action_taken_name, applicant_race_and_ethnicity) %>%
    summarise(ActionCount = n()) %>%
    arrange(desc(ActionCount))
  # Join on the shared key explicitly so dplyr does not have to guess the key
  # (the old implicit join printed a 'Joining, by = ...' message on every pass).
  mortgage_status_aggregated_by_race1_by_county <-
    inner_join(mortgage_status_by_race1_by_county, mortgage_by_race1_county,
               by = "applicant_race_and_ethnicity") %>%
    mutate(percentage = (ActionCount / RaceCount) * 100)
  print(graph_application_race_and_ethnicity_proportion_of_loans(mortgage_status_aggregated_by_race1_by_county))
}
Joining, by = "applicant_race_and_ethnicity"
Now we start looking at the missing values and see how we can deal with them. So here, we try to visualize the missing values.
# Plot missing-value counts for the raw data frame — presumably a helper
# defined in the sourced utils/utils.r; verify against that file.
visualize_missing_values(hmda_data_pa_df)
In this graph, we see the missing-value count for each column and for each category too. There are a lot of missing values in some columns, such as the co-applicant race and the applicant race 2-4 columns.
Now we try to impute the missing values. The easy way out here is to impute them with the mice function. It's not the best approach, but initially we go with this and see how it performs. # Impute as needed.
# CART-based multiple imputation via mice. m = 1 imputation and maxit = 2
# iterations keep runtime manageable on this large data set; the fixed seed
# makes the imputation reproducible.
# https://www.rdocumentation.org/packages/mice/versions/3.8.0/topics/mice.impute.cart
hmda_data_pa_df_imputed <- mice(hmda_data_pa_df, m=1, maxit=2, meth='cart',seed=500)
# complete() fills the original frame with the (single) imputed data set.
hmda_data_pa_df_imputed <- mice::complete(hmda_data_pa_df_imputed)
summary(hmda_data_pa_df_imputed)
# NOTE(review): this plots missingness of the *imputed* frame — presumably to
# confirm what mice left unfilled; verify this was not meant to use the raw df.
gg_miss_upset(hmda_data_pa_df_imputed)
We see how the variables in the data are correlated to each other. We do this by correlation plots.
Banks use this ratio a lot when they have to compare an applicant's income against the loan amount applied for. So it's a good variable to give extra information about the application.
# hmda_data_pa_df_imputed <- hmda_data_pa_df;
# https://stackoverflow.com/questions/20637360/convert-all-data-frame-character-columns-to-factors
# Loan-to-income ratio: both columns are in $000s, so the units cancel.
# NA in either column propagates to an NA ratio.
hmda_data_pa_df_imputed$loan_to_income_ratio <-
  hmda_data_pa_df_imputed$loan_amount_000s / hmda_data_pa_df_imputed$applicant_income_000s
# Convert character columns to factors so every column can be coerced to an
# integer code below. vapply is type-stable (unlike sapply) and computing the
# mask once avoids scanning the whole frame twice.
char_cols <- vapply(hmda_data_pa_df_imputed, is.character, logical(1))
hmda_data_pa_df_imputed[char_cols] <- lapply(hmda_data_pa_df_imputed[char_cols], as.factor)
# Integer-coded copy for the correlation helpers (factor levels become codes).
hmda_data_pa_df_imputed_for_correlation <- as.data.frame(lapply(hmda_data_pa_df_imputed, as.integer))
head(hmda_data_pa_df_imputed_for_correlation[, c("applicant_income_000s", "loan_amount_000s")])
corr_simple(hmda_data_pa_df_imputed_for_correlation)
corrplot(cor(hmda_data_pa_df_imputed_for_correlation[, c("applicant_income_000s", "loan_amount_000s")], use = "na.or.complete"))
# Build the modeling frame — presumably a helper from the sourced
# utils/model_utils.r; verify against that file.
# NOTE(review): the captured output below shows this call once failed because
# hmda_data_pa_df_imputed was not in scope — the imputation chunk must run first.
hmda_model_df <- hmda_data_frame_for_model(hmda_data_pa_df_imputed)
Error in hmda_data_frame_for_model(hmda_data_pa_df_imputed) :
object 'hmda_data_pa_df_imputed' not found
# Count of applications per loan purpose, filled by loan decision.
# geom_bar() is the supported geom for counting a discrete variable;
# geom_histogram(stat = "count") draws the same plot but triggers
# "Ignoring unknown parameters" warnings in current ggplot2.
l <- ggplot(hmda_model_df, aes(loan_purpose, fill = loan_granted))
l <- l + geom_bar() + coord_flip()
print(l)
# Base-graphics bar view of the loan_granted factor distribution.
plot(hmda_model_df$loan_granted, main="Loan granted Variable",
col=colors()[100:102],
xlab="Loan distribution")
# Raw loan-amount histogram, annotated with its skewness statistic.
skew <- paste("Skewness:", skewness(hmda_model_df$loan_amount_000s, na.rm = TRUE))
ggplot(data = hmda_model_df, aes(x = loan_amount_000s)) +
  geom_histogram(fill = "steelblue") +
  labs(title = "Loan amount distribution" , x = "Loan amount in thousands" , y = "Count") +
  annotate("text", x = 100000, y = 300000, size = 3.2, label = skew)
Looks like the data is highly skewed.
#install.packages("moments")
# moments provides skewness(); it was already loaded at the top of the file,
# so this library() call is a harmless repeat kept for chunk independence.
library(moments)
# Quantify the right skew visible in the histogram above.
skewness(hmda_model_df$loan_amount_000s,na.rm = TRUE)
The data for loan amount is highly right skewed. A transformation should be applied so that the skew does not degrade the prediction model.
# Log transform the loan amount and re-check skewness; the log scale should
# pull the long right tail in toward symmetry.
skew <- paste("Skewness:", skewness(log(hmda_model_df$loan_amount_000s), na.rm = TRUE))
ggplot(data = hmda_model_df, aes(x = log(loan_amount_000s))) +
  geom_histogram(fill = "steelblue") +
  labs(title = "Log transformed distribution for Loan amount" , x = "log(Loan Amount)", y = 'Count') +
  annotate("text", x = 8, y = 100000, size = 3.2, label = skew)
skewness(log(hmda_model_df$loan_amount_000s), na.rm = TRUE)
# Boxplot of the same transformed variable to expose remaining outliers.
boxplot(log(hmda_model_df$loan_amount_000s),
        col = colors()[100:109],
        main = "Boxplot of Log of Loan Amounts",
        xlab = "Loan Amount",
        ylab = "Distribution of Log of Loan Amounts")
# Applicant income: raw distribution, then the log-transformed version,
# each annotated with its skewness statistic.
skew <- paste("Skewness:", skewness(hmda_model_df$applicant_income_000s, na.rm = TRUE))
ggplot(data = hmda_model_df, aes(x = applicant_income_000s)) +
  geom_histogram(fill = "steelblue") +
  labs(title = "Applicant Income distribution" , x = "Applicant Income in thousands" , y = "Count") +
  annotate("text", x = 100000, y = 90000, size = 3.2, label = skew)
skew <- paste("Skewness:", skewness(log(hmda_model_df$applicant_income_000s), na.rm = TRUE))
ggplot(data = hmda_model_df, aes(x = log(applicant_income_000s))) +
  geom_histogram(fill = "steelblue") +
  labs(title = "Log transformed distribution for Applicant Income" , x = "log(Applicant Income)", y = 'Count') +
  annotate("text", x = 10, y = 90000, size = 3.2, label = skew)
# Do approved and denied applications differ in (log) loan size and income?
boxplot(log(loan_amount_000s) ~ loan_granted,
        data = hmda_model_df,
        xlab = "Loan decision",
        ylab = "Log of Loan Amounts",
        col = c("pink", "lightblue"),
        main = "Exploratory Data Analysis Plot\n of Loan Decision Versus Log of Loan Amounts")
boxplot(log(applicant_income_000s) ~ loan_granted,
        data = hmda_model_df,
        xlab = "Loan decision",
        ylab = "Log of Applicant Income",
        col = c("pink", "lightblue"),
        main = "Exploratory Data Analysis Plot\n of Loan Decision Versus Log of Applicant Income")
# Jitter plots of income, loan amount, and loan-to-income ratio against
# race/ethnicity, colored by the loan decision.
ggplot(hmda_model_df,
       aes(x = log(applicant_income_000s),
           y = applicant_race_and_ethnicity,
           color = loan_granted)) +
  theme_light() +
  geom_jitter() +
  ggtitle("Log of Applicant income vs. Applicant race and ethnicity , by color = Loan decision")
ggplot(hmda_model_df,
       aes(x = log(loan_amount_000s),
           y = applicant_race_and_ethnicity,
           color = loan_granted)) +
  theme_light() +
  geom_jitter() +
  ggtitle("Log of loan amount vs. Applicant race and ethnicity , by color = Loan decision")
ggplot(hmda_model_df,
       aes(x = loan_to_income_ratio,
           y = applicant_race_and_ethnicity,
           color = loan_granted)) +
  theme_light() +
  geom_jitter() +
  ggtitle("Loan to Income ratio vs. Applicant race and ethnicity , by color = Loan decision")
# Persist the imputed data set for downstream modeling scripts.
# data_dir already ends in "/" (see setup), so the old paste(..., "/2014/...")
# produced a double slash; paste0 with a relative subpath keeps the path clean.
write.csv(hmda_data_pa_df_imputed,
          paste0(data_dir, "2014/hmda_2014_pa_imputed.csv"),
          row.names = FALSE)